Setup

Load Packages

library(plotly)
library(readxl)
library(tidyverse)
library(mapdata)
library(gridExtra)
library(pheatmap)
options(scipen = 999)

Load Data

EUSuperstore <- read_excel("EUSuperstore.xls")

Summary of the Dataset

The summary of numeric variables in this data set is:

graph_profit <- ggplot(EUSuperstore, aes(y = Profit)) +
  geom_boxplot()
graph_sales <- ggplot(EUSuperstore, aes(y = Sales)) +
  geom_boxplot()
graph_discount <- ggplot(EUSuperstore, aes(y = Discount)) +
  geom_boxplot()
graph_quantity <- ggplot(EUSuperstore, aes(y = Quantity)) +
  geom_boxplot()
grid.arrange(graph_sales, graph_profit,graph_discount,graph_quantity, ncol = 4)

summary(EUSuperstore$Sales)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    2.955   49.462  119.355  293.809  320.709 7958.580
summary(EUSuperstore$Quantity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   3.777   5.000  14.000
summary(EUSuperstore$Discount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1031  0.1000  0.8500
summary(EUSuperstore$Profit)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3059.82     1.32    14.22    37.28    48.51  3979.08

Data Preparation and Cleaning

Are there any Null values?

EUSuperstore |> 
  janitor::clean_names()
## # A tibble: 10,000 x 20
##    row_id order_id order_date          ship_date           ship_mode customer_id
##     <dbl> <chr>    <dttm>              <dttm>              <chr>     <chr>      
##  1      1 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045   
##  2      2 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045   
##  3      3 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045   
##  4      4 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045   
##  5      5 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045   
##  6      6 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045   
##  7      7 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045   
##  8      8 ES-2016~ 2016-08-02 00:00:00 2016-08-07 00:00:00 Second C~ EB-13840   
##  9      9 ES-2016~ 2016-08-02 00:00:00 2016-08-07 00:00:00 Second C~ EB-13840   
## 10     10 ES-2016~ 2016-08-02 00:00:00 2016-08-07 00:00:00 Second C~ EB-13840   
## # ... with 9,990 more rows, and 14 more variables: customer_name <chr>,
## #   segment <chr>, city <chr>, state <chr>, country <chr>, region <chr>,
## #   product_id <chr>, category <chr>, sub_category <chr>, product_name <chr>,
## #   sales <dbl>, quantity <dbl>, discount <dbl>, profit <dbl>
is.null(EUSuperstore)
## [1] FALSE
View(EUSuperstore)

Visualization

Let’s analyze patterns in our Dataset

Category Counts of Superstore

ggplotly(
  ggplot(EUSuperstore, aes(x = Category, fill = Category)) +
    geom_bar() +
    labs(x = "Category", y = "Frequency",
         title = "Frequency of Category"))

Analysis: This bar plot shows that the superstore has more Office Supplies than Technology or Furniture.

Sub-Category Counts

ggplotly(
  ggplot(data = EUSuperstore, aes(x = `Sub-Category`, fill = `Sub-Category`)) +
    geom_bar() +
    theme(axis.text.x = element_text(angle = 90)) +
    labs(x = "SUb Category", y = "Frequency", title = "Frequency of SUb-Category")
)

Analysis: This graph demonstrates that the superstore offers a large range of Art, Binder, and Storage items in Sub-Category.

Sales vs Quantity

ggplot(data = EUSuperstore, aes(x = `Ship Mode`, y = Sales, fill = `Ship Mode`)) +
  geom_bar(stat = "identity")

Sales vs Profit

ggplot(data = EUSuperstore, aes(x = Sales, y = Profit, color =`Ship Mode`)) + geom_point()

Analysis: And hence, obviously we see more profits/loss have been availed from the standard shipment class.But, there are not higher range profits seen this feature.

Sales vs Discount

Let us see how Sales are affected if discounts are offered.

ggplot() + geom_point(data = EUSuperstore, aes(x = Discount, y = Sales, color = `Ship Mode`))

Profits vs Discount

Let’s see whether profits have been triggered if discounts have been redeemed.

ggplot() + geom_bar(data = EUSuperstore, aes(x = Discount, y = Profit, fill = `Ship Mode`), stat = "identity")

Let us see if this is the case with other segments

ggplot() + geom_bar(data = EUSuperstore, aes(x = `Sub-Category`, y = Profit, fill = Region), stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

EUSuperstore |>
  group_by(Region, `Sub-Category`) |>
  summarise(total_sales = sum(Sales)) -> plot

plot |>
  pivot_longer(Region,
               names_to = "key",
               values_to = "value") -> up_plot
ggplot(data = up_plot, aes(x = `Sub-Category`, y = total_sales, fill = value)) +
  geom_col(position = "dodge") +
  theme(axis.text.x = element_text(angle = 45))

ggplot() + geom_bar(data = EUSuperstore, aes(x = Category, y = Sales, fill = Region), stat = "identity") + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

ggplot() + geom_bar(data = EUSuperstore, aes(x = Category, y = Profit, fill = Region), stat = "identity") + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

ggplot() + geom_col(data = EUSuperstore, aes(x = Sales, y = Profit, color = Category))

EUSuperstore |>
  group_by(Region) |>
  summarise(total_sales = sum(Sales)) -> pplot
pie(pplot$total_sales, labels = pplot$Region)

Finding the cost

cost_eu_superstore <- EUSuperstore |>
  mutate(cost = Sales - Profit) |>
  group_by(`Sub-Category`) |>
  summarise(total_cost = sum(cost))
ggplotly(
  ggplot(cost_eu_superstore, aes(x = `Sub-Category`, y = total_cost, fill = `Sub-Category`)) +
    geom_bar(stat = 'identity') +
    theme(axis.text.x = element_text(angle = 45)) +
    labs(x = "Sub Category", y = "Total Cost", title = "Total Costs in Each Sub Category")
)

Analysis: Copier, phone, storage, and bookshelf costs are higher for retailers.

Segmentation Count

ggplotly(
  ggplot(EUSuperstore, aes(x = Segment, fill = Segment)) +
    geom_bar() +
    labs(x = "Segment", y = "Frequency", title = "Popularity of Segment")
)

Analysis: The consumer segment is more appealing.

Calculating profit in each sub-category segment by segment

prof_segment_subcat <- EUSuperstore |>
  select(Segment, `Sub-Category`, Profit) |>
  group_by(Segment, `Sub-Category`) |>
  summarise(total_prof = sum(Profit)) |>
  pivot_longer(Segment,
               values_to = "segment")

ggplotly(
  ggplot(prof_segment_subcat, aes(x = `Sub-Category`, fill = segment, y = total_prof)) +
    geom_bar(stat = "identity", position = "dodge") +
    theme(axis.text.x = element_text(angle = 45)) +
    labs(x = "Sub Category", y = "Total Profit", title = "Total Profit of Sub Category in Each Segment")
)

Analysis: Because the profit rate of consumers in each sub-category is higher in the graph above, we may devote more attention to this sector than to Corporate and Home Office.

The impact of discounted invoice values

EUSuperstore |> 
  mutate(given_discount = Discount > 0) |>
  mutate(discount_amount = ifelse(given_discount,
                                  (Sales/(1 - Discount)) - Sales, 
                                  0)) |> 
  group_by(`Order ID`) |> 
  summarise(invoice_value = sum(Sales),
            discount_value = sum(discount_amount)) |> 
  mutate(discount_given = discount_value > 0) -> transaction_discount

ggplotly(
  ggplot(data = transaction_discount, aes(y = invoice_value, x = discount_given, color = discount_given)) +
    geom_boxplot() +
    labs(x = "Is Discount Given", y = "Invoice Value", title = "Effect of Invoice Values After Giving Discount")
)

Analysis: The discount applied to a specific item on each invoice is advantageous to us since it raises the invoice value. It’s also conceivable that the number of discounts granted in each transaction is significantly higher than the number of discounts not given. As a result, the transaction values of the discount granted will appear to grow, resulting in bias. As a result, we must examine the fraction of transactions with and without discounts.

Verification of the prior outcome

ggplotly(
  ggplot(data = transaction_discount, aes(x = discount_given, fill = discount_given)) +
    geom_bar() +
    labs(y = "Frequency", x = "Is Discount Given", title = "Count of Transaction in Which Discount Given")
)

Analysis: So our earlier boxplot analysis was right! The discount applied to a specific item on each invoice is advantageous to us since it raises the invoice value.

Relation Between Sales and Profit

ggplotly(
  ggplot(EUSuperstore, aes(x = Sales, y = Profit)) +
    geom_hex() +
    geom_smooth() +
    labs(title = "Relationship Between Sales and Profit")
)

Analysis: The Profit increases as the Sales increase.

Geographical Plot of Country-wise Sales:

EUSuperstore |>
  group_by(Country) |>
  dplyr::summarise(total_sales = sum(Sales)) |>
  arrange(desc(total_sales)) -> countrywise_sales
data.table::data.table(countrywise_sales)
##            Country total_sales
##  1:         France  858931.083
##  2:        Germany  628840.030
##  3: United Kingdom  528576.300
##  4:          Italy  289709.658
##  5:          Spain  287146.680
##  6:        Austria   81162.000
##  7:    Netherlands   77514.945
##  8:        Belgium   49226.700
##  9:         Sweden   30491.403
## 10:    Switzerland   24877.860
## 11:        Finland   20704.350
## 12:         Norway   20525.370
## 13:        Ireland   16639.509
## 14:       Portugal   15105.120
## 15:        Denmark    8638.053
worldmap <- map_data("world")
colnames(countrywise_sales) = c("region", "values")
country_map <- countrywise_sales |>
  left_join(worldmap, countrywise_sales, by = "region")
country_map <- country_map |>
  filter(!is.na(country_map$values))
centroid <- aggregate(cbind(long, lat) ~ region, data = country_map, FUN = mean)

ggplotly(
  ggplot(data = country_map, aes(x = long, y = lat)) +
  geom_polygon(aes(fill = values), alpha = 0.7) +
    geom_text(data = centroid, aes(x = long, y = lat, label = region)) +
    coord_equal() +
    labs(title = "Geographical Distribution of Sales Value")
)

Analysis: Because the value of sales in France, the United Kingdom, and Germany is higher, we should extend our product offerings in these nations.

Conclusion:

Same day shipment if receives more discounts can trigger sales/profits. Discounts should be based on the Sales and should not increase a particular range otherwise unnecessary discounts with low sales can witness huge losses. Binders and Machines industry should be focused upon more so as to strengthen these weakened industry areas.Office Supplies and the Furniture industries do not seem to boom in the Central Region.